[LAA] Support assumptions in evaluatePtrAddRecAtMaxBTCWillNotWrap #147047
Conversation
@llvm/pr-subscribers-llvm-analysis
Author: Florian Hahn (fhahn)
Changes
This patch extends the logic added in #128061 to support dereferenceability information from assumptions as well.
Unfortunately both the assumption cache and the dominator tree need to be threaded through multiple layers to make them available where needed.
Patch is 20.22 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/147047.diff
8 Files Affected:
diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index 1faf279ae2012..7df31d366970e 100644
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -180,10 +180,12 @@ class MemoryDepChecker {
const SmallVectorImpl<Instruction *> &Instrs) const;
};
- MemoryDepChecker(PredicatedScalarEvolution &PSE, const Loop *L,
+ MemoryDepChecker(PredicatedScalarEvolution &PSE, AssumptionCache *AC,
+ DominatorTree *DT, const Loop *L,
const DenseMap<Value *, const SCEV *> &SymbolicStrides,
unsigned MaxTargetVectorWidthInBits)
- : PSE(PSE), InnermostLoop(L), SymbolicStrides(SymbolicStrides),
+ : PSE(PSE), AC(AC), DT(DT), InnermostLoop(L),
+ SymbolicStrides(SymbolicStrides),
MaxTargetVectorWidthInBits(MaxTargetVectorWidthInBits) {}
/// Register the location (instructions are given increasing numbers)
@@ -288,6 +290,9 @@ class MemoryDepChecker {
return PointerBounds;
}
+ AssumptionCache *getAC() const { return AC; }
+ DominatorTree *getDT() const { return DT; }
+
private:
/// A wrapper around ScalarEvolution, used to add runtime SCEV checks, and
/// applies dynamic knowledge to simplify SCEV expressions and convert them
@@ -296,6 +301,10 @@ class MemoryDepChecker {
/// example we might assume a unit stride for a pointer in order to prove
/// that a memory access is strided and doesn't wrap.
PredicatedScalarEvolution &PSE;
+
+ AssumptionCache *AC;
+ DominatorTree *DT;
+
const Loop *InnermostLoop;
/// Reference to map of pointer values to
@@ -669,7 +678,7 @@ class LoopAccessInfo {
LLVM_ABI LoopAccessInfo(Loop *L, ScalarEvolution *SE,
const TargetTransformInfo *TTI,
const TargetLibraryInfo *TLI, AAResults *AA,
- DominatorTree *DT, LoopInfo *LI,
+ DominatorTree *DT, LoopInfo *LI, AssumptionCache *AC,
bool AllowPartial = false);
/// Return true we can analyze the memory accesses in the loop and there are
@@ -921,7 +930,8 @@ LLVM_ABI std::pair<const SCEV *, const SCEV *> getStartAndEndForAccess(
const Loop *Lp, const SCEV *PtrExpr, Type *AccessTy, const SCEV *BTC,
const SCEV *MaxBTC, ScalarEvolution *SE,
DenseMap<std::pair<const SCEV *, Type *>,
- std::pair<const SCEV *, const SCEV *>> *PointerBounds);
+ std::pair<const SCEV *, const SCEV *>> *PointerBounds,
+ AssumptionCache *AC, DominatorTree *DT);
class LoopAccessInfoManager {
/// The cache.
@@ -934,12 +944,13 @@ class LoopAccessInfoManager {
LoopInfo &LI;
TargetTransformInfo *TTI;
const TargetLibraryInfo *TLI = nullptr;
+ AssumptionCache *AC;
public:
LoopAccessInfoManager(ScalarEvolution &SE, AAResults &AA, DominatorTree &DT,
LoopInfo &LI, TargetTransformInfo *TTI,
- const TargetLibraryInfo *TLI)
- : SE(SE), AA(AA), DT(DT), LI(LI), TTI(TTI), TLI(TLI) {}
+ const TargetLibraryInfo *TLI, AssumptionCache *AC)
+ : SE(SE), AA(AA), DT(DT), LI(LI), TTI(TTI), TLI(TLI), AC(AC) {}
LLVM_ABI const LoopAccessInfo &getInfo(Loop &L, bool AllowPartial = false);
diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp
index 880249588f0b2..7b4e00b298657 100644
--- a/llvm/lib/Analysis/Loads.cpp
+++ b/llvm/lib/Analysis/Loads.cpp
@@ -326,7 +326,7 @@ bool llvm::isDereferenceableAndAlignedInLoop(
return false;
const auto &[AccessStart, AccessEnd] = getStartAndEndForAccess(
- L, PtrScev, LI->getType(), BECount, MaxBECount, &SE, nullptr);
+ L, PtrScev, LI->getType(), BECount, MaxBECount, &SE, nullptr, AC, &DT);
if (isa<SCEVCouldNotCompute>(AccessStart) ||
isa<SCEVCouldNotCompute>(AccessEnd))
return false;
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 007ee3cf01502..d254d1dab1d04 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -23,6 +23,8 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/AssumeBundleQueries.h"
+#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
@@ -208,28 +210,50 @@ static const SCEV *mulSCEVOverflow(const SCEV *A, const SCEV *B,
/// Return true, if evaluating \p AR at \p MaxBTC cannot wrap, because \p AR at
/// \p MaxBTC is guaranteed inbounds of the accessed object.
-static bool evaluatePtrAddRecAtMaxBTCWillNotWrap(const SCEVAddRecExpr *AR,
- const SCEV *MaxBTC,
- const SCEV *EltSize,
- ScalarEvolution &SE,
- const DataLayout &DL) {
+static bool
+evaluatePtrAddRecAtMaxBTCWillNotWrap(const SCEVAddRecExpr *AR,
+ const SCEV *MaxBTC, const SCEV *EltSize,
+ ScalarEvolution &SE, const DataLayout &DL,
+ AssumptionCache *AC, DominatorTree *DT) {
auto *PointerBase = SE.getPointerBase(AR->getStart());
auto *StartPtr = dyn_cast<SCEVUnknown>(PointerBase);
if (!StartPtr)
return false;
+ const Loop *L = AR->getLoop();
bool CheckForNonNull, CheckForFreed;
uint64_t DerefBytes = StartPtr->getValue()->getPointerDereferenceableBytes(
DL, CheckForNonNull, CheckForFreed);
- if (CheckForNonNull || CheckForFreed)
+ if (DerefBytes && (CheckForNonNull || CheckForFreed))
return false;
const SCEV *Step = AR->getStepRecurrence(SE);
+ Type *WiderTy = SE.getWiderType(MaxBTC->getType(), Step->getType());
+ const SCEV *DerefBytesSCEV = SE.getConstant(WiderTy, DerefBytes);
+
+ // Check if we have a suitable dereferencable assumption we can use.
+ RetainedKnowledge DerefRK;
+ if (getKnowledgeForValue(
+ StartPtr->getValue(), {Attribute::Dereferenceable}, *AC,
+ [&](RetainedKnowledge RK, Instruction *Assume, auto) {
+ if (!isValidAssumeForContext(
+ Assume, L->getLoopPredecessor()->getTerminator(), DT))
+ return false;
+ if (RK.AttrKind == Attribute::Dereferenceable) {
+ DerefRK = std::max(DerefRK, RK);
+ return true;
+ }
+ return false;
+ }) &&
+ DerefRK.ArgValue) {
+ DerefBytesSCEV = SE.getUMaxExpr(DerefBytesSCEV,
+ SE.getConstant(WiderTy, DerefRK.ArgValue));
+ }
+
bool IsKnownNonNegative = SE.isKnownNonNegative(Step);
if (!IsKnownNonNegative && !SE.isKnownNegative(Step))
return false;
- Type *WiderTy = SE.getWiderType(MaxBTC->getType(), Step->getType());
Step = SE.getNoopOrSignExtend(Step, WiderTy);
MaxBTC = SE.getNoopOrZeroExtend(MaxBTC, WiderTy);
@@ -256,8 +280,7 @@ static bool evaluatePtrAddRecAtMaxBTCWillNotWrap(const SCEVAddRecExpr *AR,
const SCEV *EndBytes = addSCEVNoOverflow(StartOffset, OffsetEndBytes, SE);
if (!EndBytes)
return false;
- return SE.isKnownPredicate(CmpInst::ICMP_ULE, EndBytes,
- SE.getConstant(WiderTy, DerefBytes));
+ return SE.isKnownPredicate(CmpInst::ICMP_ULE, EndBytes, DerefBytesSCEV);
}
// For negative steps check if
@@ -265,15 +288,15 @@ static bool evaluatePtrAddRecAtMaxBTCWillNotWrap(const SCEVAddRecExpr *AR,
// * StartOffset <= DerefBytes.
assert(SE.isKnownNegative(Step) && "must be known negative");
return SE.isKnownPredicate(CmpInst::ICMP_SGE, StartOffset, OffsetEndBytes) &&
- SE.isKnownPredicate(CmpInst::ICMP_ULE, StartOffset,
- SE.getConstant(WiderTy, DerefBytes));
+ SE.isKnownPredicate(CmpInst::ICMP_ULE, StartOffset, DerefBytesSCEV);
}
std::pair<const SCEV *, const SCEV *> llvm::getStartAndEndForAccess(
const Loop *Lp, const SCEV *PtrExpr, Type *AccessTy, const SCEV *BTC,
const SCEV *MaxBTC, ScalarEvolution *SE,
DenseMap<std::pair<const SCEV *, Type *>,
- std::pair<const SCEV *, const SCEV *>> *PointerBounds) {
+ std::pair<const SCEV *, const SCEV *>> *PointerBounds,
+ AssumptionCache *AC, DominatorTree *DT) {
std::pair<const SCEV *, const SCEV *> *PtrBoundsPair;
if (PointerBounds) {
auto [Iter, Ins] = PointerBounds->insert(
@@ -308,8 +331,8 @@ std::pair<const SCEV *, const SCEV *> llvm::getStartAndEndForAccess(
// sets ScEnd to the maximum unsigned value for the type. Note that LAA
// separately checks that accesses cannot not wrap, so unsigned max
// represents an upper bound.
- if (evaluatePtrAddRecAtMaxBTCWillNotWrap(AR, MaxBTC, EltSizeSCEV, *SE,
- DL)) {
+ if (evaluatePtrAddRecAtMaxBTCWillNotWrap(AR, MaxBTC, EltSizeSCEV, *SE, DL,
+ AC, DT)) {
ScEnd = AR->evaluateAtIteration(MaxBTC, *SE);
} else {
ScEnd = SE->getAddExpr(
@@ -356,9 +379,9 @@ void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, const SCEV *PtrExpr,
bool NeedsFreeze) {
const SCEV *SymbolicMaxBTC = PSE.getSymbolicMaxBackedgeTakenCount();
const SCEV *BTC = PSE.getBackedgeTakenCount();
- const auto &[ScStart, ScEnd] =
- getStartAndEndForAccess(Lp, PtrExpr, AccessTy, BTC, SymbolicMaxBTC,
- PSE.getSE(), &DC.getPointerBounds());
+ const auto &[ScStart, ScEnd] = getStartAndEndForAccess(
+ Lp, PtrExpr, AccessTy, BTC, SymbolicMaxBTC, PSE.getSE(),
+ &DC.getPointerBounds(), DC.getAC(), DC.getDT());
assert(!isa<SCEVCouldNotCompute>(ScStart) &&
!isa<SCEVCouldNotCompute>(ScEnd) &&
"must be able to compute both start and end expressions");
@@ -2011,10 +2034,10 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize(
const SCEV *SymbolicMaxBTC = PSE.getSymbolicMaxBackedgeTakenCount();
const auto &[SrcStart_, SrcEnd_] =
getStartAndEndForAccess(InnermostLoop, Src, ATy, BTC, SymbolicMaxBTC,
- PSE.getSE(), &PointerBounds);
+ PSE.getSE(), &PointerBounds, AC, DT);
const auto &[SinkStart_, SinkEnd_] =
getStartAndEndForAccess(InnermostLoop, Sink, BTy, BTC, SymbolicMaxBTC,
- PSE.getSE(), &PointerBounds);
+ PSE.getSE(), &PointerBounds, AC, DT);
if (!isa<SCEVCouldNotCompute>(SrcStart_) &&
!isa<SCEVCouldNotCompute>(SrcEnd_) &&
!isa<SCEVCouldNotCompute>(SinkStart_) &&
@@ -3015,7 +3038,7 @@ LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE,
const TargetTransformInfo *TTI,
const TargetLibraryInfo *TLI, AAResults *AA,
DominatorTree *DT, LoopInfo *LI,
- bool AllowPartial)
+ AssumptionCache *AC, bool AllowPartial)
: PSE(std::make_unique<PredicatedScalarEvolution>(*SE, *L)),
PtrRtChecking(nullptr), TheLoop(L), AllowPartial(AllowPartial) {
unsigned MaxTargetVectorWidthInBits = std::numeric_limits<unsigned>::max();
@@ -3025,8 +3048,8 @@ LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE,
MaxTargetVectorWidthInBits =
TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) * 2;
- DepChecker = std::make_unique<MemoryDepChecker>(*PSE, L, SymbolicStrides,
- MaxTargetVectorWidthInBits);
+ DepChecker = std::make_unique<MemoryDepChecker>(
+ *PSE, AC, DT, L, SymbolicStrides, MaxTargetVectorWidthInBits);
PtrRtChecking = std::make_unique<RuntimePointerChecking>(*DepChecker, SE);
if (canAnalyzeLoop())
CanVecMem = analyzeLoop(AA, LI, TLI, DT);
@@ -3095,7 +3118,7 @@ const LoopAccessInfo &LoopAccessInfoManager::getInfo(Loop &L,
// or if it was created with a different value of AllowPartial.
if (Inserted || It->second->hasAllowPartial() != AllowPartial)
It->second = std::make_unique<LoopAccessInfo>(&L, &SE, TTI, TLI, &AA, &DT,
- &LI, AllowPartial);
+ &LI, AC, AllowPartial);
return *It->second;
}
@@ -3138,7 +3161,8 @@ LoopAccessInfoManager LoopAccessAnalysis::run(Function &F,
auto &LI = FAM.getResult<LoopAnalysis>(F);
auto &TTI = FAM.getResult<TargetIRAnalysis>(F);
auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
- return LoopAccessInfoManager(SE, AA, DT, LI, &TTI, &TLI);
+ auto &AC = FAM.getResult<AssumptionAnalysis>(F);
+ return LoopAccessInfoManager(SE, AA, DT, LI, &TTI, &TLI, &AC);
}
AnalysisKey LoopAccessAnalysis::Key;
diff --git a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp
index f3e992c039178..b1096ce5ddd9f 100644
--- a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp
@@ -1009,7 +1009,8 @@ PreservedAnalyses LoopFlattenPass::run(LoopNest &LN, LoopAnalysisManager &LAM,
// in simplified form, and also needs LCSSA. Running
// this pass will simplify all loops that contain inner loops,
// regardless of whether anything ends up being flattened.
- LoopAccessInfoManager LAIM(AR.SE, AR.AA, AR.DT, AR.LI, &AR.TTI, nullptr);
+ LoopAccessInfoManager LAIM(AR.SE, AR.AA, AR.DT, AR.LI, &AR.TTI, nullptr,
+ nullptr);
for (Loop *InnerLoop : LN.getLoops()) {
auto *OuterLoop = InnerLoop->getParentLoop();
if (!OuterLoop)
diff --git a/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp b/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp
index 4f2bfb073bafa..8e2cf832024ae 100644
--- a/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp
@@ -551,7 +551,7 @@ PreservedAnalyses LoopVersioningLICMPass::run(Loop &L, LoopAnalysisManager &AM,
const Function *F = L.getHeader()->getParent();
OptimizationRemarkEmitter ORE(F);
- LoopAccessInfoManager LAIs(*SE, *AA, *DT, LAR.LI, nullptr, nullptr);
+ LoopAccessInfoManager LAIs(*SE, *AA, *DT, LAR.LI, nullptr, nullptr, nullptr);
if (!LoopVersioningLICM(AA, SE, &ORE, LAIs, LAR.LI, &L).run(DT))
return PreservedAnalyses::all();
return getLoopPassPreservedAnalyses();
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/early-exit-runtime-checks.ll b/llvm/test/Analysis/LoopAccessAnalysis/early-exit-runtime-checks.ll
index 1dc8d4a7e73f8..a942a0e35830f 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/early-exit-runtime-checks.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/early-exit-runtime-checks.ll
@@ -518,10 +518,10 @@ define void @all_exits_dominate_latch_countable_exits_at_most_500_iterations_kno
; CHECK-NEXT: %gep.A = getelementptr inbounds i32, ptr %A, i64 %iv
; CHECK-NEXT: Grouped accesses:
; CHECK-NEXT: Group GRP0:
-; CHECK-NEXT: (Low: %B High: inttoptr (i64 -1 to ptr))
+; CHECK-NEXT: (Low: %B High: (2000 + %B))
; CHECK-NEXT: Member: {%B,+,4}<nuw><%loop.header>
; CHECK-NEXT: Group GRP1:
-; CHECK-NEXT: (Low: %A High: inttoptr (i64 -1 to ptr))
+; CHECK-NEXT: (Low: %A High: (2000 + %A))
; CHECK-NEXT: Member: {%A,+,4}<nuw><%loop.header>
; CHECK-EMPTY:
; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
diff --git a/llvm/test/Transforms/LoopVectorize/single-early-exit-deref-assumptions.ll b/llvm/test/Transforms/LoopVectorize/single-early-exit-deref-assumptions.ll
index 0fe893abec86c..c42b4f66da27b 100644
--- a/llvm/test/Transforms/LoopVectorize/single-early-exit-deref-assumptions.ll
+++ b/llvm/test/Transforms/LoopVectorize/single-early-exit-deref-assumptions.ll
@@ -7,21 +7,48 @@ define i64 @early_exit_alignment_and_deref_known_via_assumption_with_constant_si
; CHECK-NEXT: entry:
; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[P1]], i64 4), "dereferenceable"(ptr [[P1]], i64 1024) ]
; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[P2]], i64 4), "dereferenceable"(ptr [[P2]], i64 1024) ]
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX1]]
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX1]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
+; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4
+; CHECK-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]])
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 1024
+; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_SPLIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: middle.split:
+; CHECK-NEXT: br i1 [[TMP5]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
+; CHECK: middle.block:
+; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; CHECK: vector.early.exit:
+; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 true)
+; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], [[TMP8]]
+; CHECK-NEXT: br label [[LOOP_END]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[LOOP1:%.*]]
; CHECK: loop:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
-; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]]
+; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]]
; CHECK: loop.inc:
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP1]], label [[LOOP_END]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: loop.end:
-; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ -1, [[LOOP_INC]] ]
+; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP1]] ], [ -1, [[LOOP_INC]] ], [ -1, [[MIDDLE_BLOCK]] ], [ [[TMP9]], [[VECTOR_EARLY_EXIT]] ]
; CHECK-NEXT: ret i64 [[RETVAL]]
;
entry:
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp
index 118bf67320a3b..c365c95da6bff 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp
@@ -41,7 +41,8 @@ class VPlanSlpTest : public VPlanTestIRBase {
AARes.reset(new AAResults(*TLI));
AARes->addAAResult(*BasicAA);
PSE.reset(new PredicatedScalarEvolution(*SE, *L));
- LAI.reset(new LoopAccessInfo(L, &*SE, nullptr, &*TLI, &*AARes, &*DT, &*LI));
+ LAI.reset(new LoopAccessInf...
[truncated]
Force-pushed from 5e41879 to 1cfb0c2.
AssumptionCache *AC;
DominatorTree *DT;
An alternative would be to retrieve them directly from ScalarEvolution, which already holds them, but they aren't accessible there at the moment. Not sure if we should expose them so they can be used more conveniently here in the patch.
Thanks for this! Just a couple of comments ...
// Check if we have a suitable dereferencable assumption we can use.
RetainedKnowledge DerefRK;
if (!StartPtrV->canBeFreed() &&
I think this could be simplified by just doing:
if (!StartPtrV->canBeFreed()) {
  RetainedKnowledge DerefRK = getKnowledgeValidInContext(
      StartPtrV, {Attribute::Dereferenceable}, *AC,
      L->getLoopPredecessor()->getTerminator(), DT);
  if (!DerefRK)
    return false;
  DerefRK = std::max(DerefRK, RK);
  ...
}
Updated, thanks.
StartPtrV, {Attribute::Dereferenceable}, *AC,
[&](RetainedKnowledge RK, Instruction *Assume, auto) {
  if (!isValidAssumeForContext(
          Assume, L->getLoopPredecessor()->getTerminator(), DT))
Why is it safe to assume that the loop predecessor is a good enough context for the assume? Couldn't the assumption be broken in the loop? I was expecting to see the context instruction here being the actual pointer corresponding to the thing that could potentially wrap, i.e. %gep = getelementptr ...
The assumption must hold throughout the loop. The only way the assumption can be invalidated is by freeing the memory between the assumption and the use. For now, we require that the pointer cannot be freed throughout the whole function.
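To illustrate that argument, here is a minimal standalone sketch of the check, closely following the lambda in the diff above (the wrapper function and its name are illustrative only, not part of the committed patch):

#include "llvm/Analysis/AssumeBundleQueries.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Dominators.h"
#include <algorithm>
#include <cstdint>

using namespace llvm;

// Sketch: return the number of dereferenceable bytes promised by an assume for
// StartPtrV that holds for every iteration of L. An assume that is valid at
// the loop predecessor's terminator dominates all iterations, and requiring
// that the pointer cannot be freed anywhere in the function guarantees nothing
// between the assume and the loop body invalidates that promise.
static uint64_t derefBytesFromAssumption(Value *StartPtrV, const Loop *L,
                                         AssumptionCache &AC,
                                         DominatorTree *DT) {
  BasicBlock *Pred = L->getLoopPredecessor();
  if (!Pred || StartPtrV->canBeFreed())
    return 0;
  RetainedKnowledge DerefRK;
  getKnowledgeForValue(
      StartPtrV, {Attribute::Dereferenceable}, AC,
      [&](RetainedKnowledge RK, Instruction *Assume, auto) {
        // Only accept assumes that already hold before the loop is entered.
        if (!isValidAssumeForContext(Assume, Pred->getTerminator(), DT))
          return false;
        if (RK.AttrKind == Attribute::Dereferenceable) {
          DerefRK = std::max(DerefRK, RK);
          return true;
        }
        return false;
      });
  // Zero means no usable assumption was found.
  return DerefRK.ArgValue;
}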
DominatorTree *DT, LoopInfo *LI,
DominatorTree *DT, LoopInfo *LI, AssumptionCache *AC,
Can have a default = nullptr for DT, LI, AC?
Sure, done, thanks.
Force-pushed from 1cfb0c2 to 064092f.
Was going to accept the patch, but there are failing Linux and Windows builds so I'll wait until they're passing. Thanks!
DerefBytesSCEV, SE.getConstant(WiderTy, DerefRK.ArgValue));
}
}
nit: Might be worth bailing out early if DerefBytesSCEV is zero, which will be quite common I think. I suspect in most cases the pointer won't be marked as dereferenceable or have an assumption, in which case getPointerDereferenceableBytes returns 0.
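A rough sketch of where such an early bail-out could sit, assuming DerefBytesSCEV at this point already combines the static dereferenceable bytes with any assumption-derived ones (placement is illustrative, not the committed change):

// If neither the pointer's attributes nor an assume tell us anything about
// how many bytes are dereferenceable, the access cannot be bounded, so give
// up before doing any further SCEV work.
if (DerefBytesSCEV->isZero())
  return false;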
Sounds good, updated, thanks!
This patch extends the logic added in llvm#128061 to support dereferenceability information from assumptions as well. Unfortunately both assumption cache and the dominator tree need to be threaded through multiple layers to make them available where needed.
Force-pushed from 064092f to e755778.
Test failures should be gone after the latest rebase/update, thanks
// Check if we have a suitable dereferencable assumption we can use.
if (!StartPtrV->canBeFreed()) {
  RetainedKnowledge DerefRK = getKnowledgeValidInContext(
There is a lot of similarity between this and isDereferenceableAndAlignedPointerViaAssumption. In the latter API, we check canBeFreed but use getKnowledgeForValue with the CtxI passed through the lambda isValidAssumeForContext(Assume, CtxI, DT). Note that we bail out in the absence of CtxI there, but we could technically use the loop predecessor's terminator there (since the pointer is not freed through the entire function).
Can we common this logic across both patches (#128436)? The difference is we check alignment assumptions as well in that API.
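A hypothetical shape for such a shared helper, purely to illustrate the suggestion; the name and signature are assumptions, and it reuses the getKnowledgeValidInContext call suggested earlier in this review (#128436 would additionally query alignment):

#include "llvm/Analysis/AssumeBundleQueries.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instruction.h"

using namespace llvm;

// Hypothetical common query for both call sites (LAA's
// evaluatePtrAddRecAtMaxBTCWillNotWrap and Loads'
// isDereferenceableAndAlignedPointerViaAssumption): return dereferenceability
// knowledge for Ptr from an assume that is valid at CtxI. Loop-based callers
// without a context instruction fall back to the loop predecessor's
// terminator, which is valid as long as the pointer cannot be freed in this
// function.
static RetainedKnowledge findDerefAssumption(const Value *Ptr, const Loop *L,
                                             const Instruction *CtxI,
                                             AssumptionCache &AC,
                                             DominatorTree *DT) {
  if (Ptr->canBeFreed())
    return RetainedKnowledge();
  if (!CtxI && L && L->getLoopPredecessor())
    CtxI = L->getLoopPredecessor()->getTerminator();
  if (!CtxI)
    return RetainedKnowledge();
  return getKnowledgeValidInContext(Ptr, {Attribute::Dereferenceable}, AC,
                                    CtxI, DT);
}

The two patches would then differ only in which attributes they query and which context instruction they pass.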
Sounds good, I'll land #128436 first and then update this PR.
The conflict is from 9693056. It uses ... However, with the latest trunk and applying this change on top, we cannot vectorize an early-exit loop (with dereferenceable bundles on assumes) that we previously could. The difference is that with "old trunk" I did not have the change #128061. It came down to the limitation mentioned here: ... Investigating what needs to be done.
In my case, even with this patch, we bail out because AccessEnd is ... However, it is orthogonal to this patch.
This patch extends the logic added in #128061 to support dereferenceability information from assumptions as well.
Unfortunately both the assumption cache and the dominator tree need to be threaded through multiple layers to make them available where needed.